Prerequisites

Load required packages

library(tidyverse)
library(ggplot2)
library(rtweet)
library(readr)

Dataset

Import processed data, which can be found here.

#read preprocessed data
wines <- read.csv(file = '../data/processed_data/wines.csv')

Get sample of dataset

#set seed value to birthday of Ricardo Rodriguez, American wrestler and ring announcer and Dr. Reinaldo (Rei) Sanchez-Arias
set.seed(19630217)

#set percentage to test with for simplicity, if needed
percentage <- 5
wine_sample<- sample_n(wines, percentage/100*nrow(wines))

Split Taster data into different Data Frame

tasters <- wines %>%
  select(taster_name, taster_twitter_handle) %>% unique()
tasters

Drop taster_twitter_handle in wines dataframe

wines <- wines %>%
  select(-taster_twitter_handle)
head(wines)

Add Reviewer profile info

Each reviewer has there own bias. To offset that we made a “profile” for each reviewer which includes characteristics like: avg_points, sd_points, and var_points

taster_rating_profile <- wines %>%
  group_by(taster_name) %>%
  summarize(
    avg_points = mean(points),
    sd_points = sd(points),
    var_points = var(points)
  )

tasters <- inner_join(tasters, taster_rating_profile, by = "taster_name")
head(tasters)

Add Rating Classification

Add following classification to wine dataset as found on the website:

Category Rating Description
Classic 98-100 The pinnacle of quality.
Superb 94-97 A great achievement.
Excellent 90-93 Highly recommended.
Very Good 87-89 Often good value; well recommended.
Good 83-86 Suitable for everyday consumption; often good value.
Acceptable 80-82 Can be employed in casual, less-critical circumstances
# function to add rating
rating_category <- function(points){
  if(points>=98){
    return("Classic")
  }
  else if (points>=94){
    return("Superb")
  }
  else if(points>=90){
    return("Excellent")
  }
  else if(points>=87){
    return("Very Good")
  }
  else if(points>=83){
    return("Good")
  }
  else{
    return("Acceptable")
  }
}

wines<- wines %>%
  rowwise() %>%
  mutate(rating_category = rating_category(points))
head(wines)

Add Adjusted Points

Since, each reviewer has a different bias we created a normalized metric, norm_points, by looking at the number of standard deviatioins a wine is from the reviewer’s avg_points. This gives use a more accurate representation of which which wines are better than the rest.

normalize_points <- function(points, taster_name){
  t <- tasters %>%
    filter(taster_name == taster_name)
  nrow(t)
  #return((points-t$avg_points)/t$sd_points)
}

wine_sample %>%
  rowwise() %>%
  mutate(norm_points = normalize_points(points, taster_name))

Data Exploration

Univariate Exploration

Correlation price by points, using DataExplorer library which can be found here

# TODO: IZZY
# TODO: OASKI (This is not producing correct results)
wines %>%
    summarize(avg_price = mean(price, na.rm=TRUE), 
              sd_price = sd(price, na.rm=TRUE),
              lowest_price = min(price, na.rm=TRUE),
              highest_price = max(price,na.rm=TRUE))
# TODO: OASKI (This is not producing correct results)
wines %>%
    summarize(avg_points = mean(points, na.rm=TRUE), 
              sd_points = sd(points, na.rm=TRUE),
              lowest_points = min(points, na.rm=TRUE),
              highest_points = max(points,na.rm=TRUE))

Price by Points

Notice the data is “stacked” and the socres range from 80-100

wines %>% 
  ggplot() +
  geom_point(mapping = (aes(x = points, y = price)), na.rm = T, alpha = 0.15) +
  labs(title = "Price by Points", x = "Points", y = "Price")

TODO: IZZY (Why did we log this?)

wines %>% 
  ggplot() +
  geom_point(mapping = (aes(x = points, y = log(price))), na.rm = T, alpha = 0.15) +
  labs(title = "log(Price) by Points", x = "Points", y = "log(Price)")

Data Analysis

#Find the best province for wine using the average points across the 1,000 samples #drop the descriptions or just select price? set points to max(points)

best_province <- wine_sample %>% 
  group_by(province, points) %>% 
  filter(points > 88.669)
best_province  

Rating distribution

Best wine, by variety

#wine_best_variety <- 
wines %>% 
  group_by(variety) %>% 
  summarise(mean_points = mean(points)) %>% 
  arrange(desc(mean_points)) 
  
user_price <- readline(prompt = "How much are you willing to spend on a bottle?")
user_price <- as.integer(user_price)

wines %>% 
  filter(price <= user_price) %>% 
  arrange(desc(points)) %>% 
  select(title, price, points)

Conclusion

LS0tCnRpdGxlOiAiRXhwbG9yaW5nIGFuZCBBbmFseWl6aW5nIFdpbmUgRW50aHVzaWFzdCBSZXZpZXdzIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIFByZXJlcXVpc2l0ZXMKCkxvYWQgcmVxdWlyZWQgcGFja2FnZXMKYGBge3IsIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KGdncGxvdDIpCmxpYnJhcnkocnR3ZWV0KQpsaWJyYXJ5KHJlYWRyKQpgYGAKCiMgRGF0YXNldAoKSW1wb3J0IHByb2Nlc3NlZCBkYXRhLCB3aGljaCBjYW4gYmUgZm91bmQgW2hlcmVdKGh0dHBzOi8vZ2l0aHViLmNvbS9DNHJieW4zbTRuL3dpbmVfcmV2aWV3c19kYXRhX2FuYWx5c2lzL2Jsb2IvbWFzdGVyL2RhdGEvcHJvY2Vzc2VkX2RhdGEvcHJlcHJvY2Vzc2luZy5ybWQpLgoKYGBge3J9CiNyZWFkIHByZXByb2Nlc3NlZCBkYXRhCndpbmVzIDwtIHJlYWQuY3N2KGZpbGUgPSAnLi4vZGF0YS9wcm9jZXNzZWRfZGF0YS93aW5lcy5jc3YnKQpgYGAKCkdldCBzYW1wbGUgb2YgZGF0YXNldApgYGB7cn0KI3NldCBzZWVkIHZhbHVlIHRvIGJpcnRoZGF5IG9mIFJpY2FyZG8gUm9kcmlndWV6LCBBbWVyaWNhbiB3cmVzdGxlciBhbmQgcmluZyBhbm5vdW5jZXIgYW5kIERyLiBSZWluYWxkbyAoUmVpKSBTYW5jaGV6LUFyaWFzCnNldC5zZWVkKDE5NjMwMjE3KQoKI3NldCBwZXJjZW50YWdlIHRvIHRlc3Qgd2l0aCBmb3Igc2ltcGxpY2l0eSwgaWYgbmVlZGVkCnBlcmNlbnRhZ2UgPC0gNQp3aW5lX3NhbXBsZTwtIHNhbXBsZV9uKHdpbmVzLCBwZXJjZW50YWdlLzEwMCpucm93KHdpbmVzKSkKYGBgCgojIyMgU3BsaXQgVGFzdGVyIGRhdGEgaW50byBkaWZmZXJlbnQgRGF0YSBGcmFtZQoKYGBge3J9CnRhc3RlcnMgPC0gd2luZXMgJT4lCiAgc2VsZWN0KHRhc3Rlcl9uYW1lLCB0YXN0ZXJfdHdpdHRlcl9oYW5kbGUpICU+JSB1bmlxdWUoKQp0YXN0ZXJzCmBgYAoKRHJvcCBgdGFzdGVyX3R3aXR0ZXJfaGFuZGxlYCBpbiB3aW5lcyBkYXRhZnJhbWUKCmBgYHtyfQp3aW5lcyA8LSB3aW5lcyAlPiUKICBzZWxlY3QoLXRhc3Rlcl90d2l0dGVyX2hhbmRsZSkKaGVhZCh3aW5lcykKYGBgCiMjIEFkZCBSZXZpZXdlciBwcm9maWxlIGluZm8KCkVhY2ggcmV2aWV3ZXIgaGFzIHRoZXJlIG93biBiaWFzLiBUbyBvZmZzZXQgdGhhdCB3ZSBtYWRlIGEgInByb2ZpbGUiIGZvciBlYWNoIHJldmlld2VyIHdoaWNoIGluY2x1ZGVzIGNoYXJhY3RlcmlzdGljcyBsaWtlOiBgYXZnX3BvaW50c2AsIGBzZF9wb2ludHNgLCBhbmQgYHZhcl9wb2ludHNgCmBgYHtyfQp0YXN0ZXJfcmF0aW5nX3Byb2ZpbGUgPC0gd2luZXMgJT4lCiAgZ3JvdXBfYnkodGFzdGVyX25hbWUpICU+JQogIHN1bW1hcml6ZSgKICAgIGF2Z19wb2ludHMgPSBtZWFuKHBvaW50cyksCiAgICBzZF9wb2ludHMgPSBzZChwb2ludHMpLAogICAgdmFyX3BvaW50cyA9IHZhcihwb2ludHMpCiAgKQoKdGFzdGVycyA8LSBpbm5lcl9qb2luKHRhc3RlcnMsIHRhc3Rlcl9yYXRpbmdfcHJvZmlsZSwgYnkgPSAidGFzdGVyX25hbWUiKQpoZWFkKHRhc3RlcnMpCmBgYAojIyMgQWRkIFJhdGluZyBDbGFzc2lmaWNhdGlvbgoKQWRkIGZvbGxvd2luZyBjbGFzc2lmaWNhdGlvbiB0byB3aW5lIGRhdGFzZXQgYXMgZm91bmQgb24gdGhlIFt3ZWJzaXRlXShodHRwczovL3d3dy53aW5lbWFnLmNvbS8yMDEwLzA0LzA5L3lvdS1hc2tlZC1ob3ctaXMtYS13aW5lcy1zY29yZS1kZXRlcm1pbmVkLyk6Cgp8Q2F0ZWdvcnkgIHwgUmF0aW5nICB8IERlc2NyaXB0aW9uICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB8CnwtLS0tLS0tLS0tfC0tLS0tLS0tLXwtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLXwKfENsYXNzaWMgICB8CTk4LTEwMCB8IFRoZSBwaW5uYWNsZSBvZiBxdWFsaXR5LiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB8CnxTdXBlcmIgICAgfAk5NC05NwkgfCBBIGdyZWF0IGFjaGlldmVtZW50LiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgfAp8RXhjZWxsZW50IHwJOTAtOTMJIHwgSGlnaGx5IHJlY29tbWVuZGVkLiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHwKfFZlcnkgR29vZCB8ICA4Ny04OQkgfCBPZnRlbiBnb29kIHZhbHVlOyB3ZWxsIHJlY29tbWVuZGVkLiAgICAgICAgICAgICAgICAgICAgfAp8R29vZAkgICAgIHwgIDgzLTg2CSB8IFN1aXRhYmxlIGZvciBldmVyeWRheSBjb25zdW1wdGlvbjsgb2Z0ZW4gZ29vZCB2YWx1ZS4gICB8CnxBY2NlcHRhYmxlfAk4MC04MgkgfCBDYW4gYmUgZW1wbG95ZWQgaW4gY2FzdWFsLCBsZXNzLWNyaXRpY2FsIGNpcmN1bXN0YW5jZXMgfAoKYGBge3J9CiMgZnVuY3Rpb24gdG8gYWRkIHJhdGluZwpyYXRpbmdfY2F0ZWdvcnkgPC0gZnVuY3Rpb24ocG9pbnRzKXsKICBpZihwb2ludHM+PTk4KXsKICAgIHJldHVybigiQ2xhc3NpYyIpCiAgfQogIGVsc2UgaWYgKHBvaW50cz49OTQpewogICAgcmV0dXJuKCJTdXBlcmIiKQogIH0KICBlbHNlIGlmKHBvaW50cz49OTApewogICAgcmV0dXJuKCJFeGNlbGxlbnQiKQogIH0KICBlbHNlIGlmKHBvaW50cz49ODcpewogICAgcmV0dXJuKCJWZXJ5IEdvb2QiKQogIH0KICBlbHNlIGlmKHBvaW50cz49ODMpewogICAgcmV0dXJuKCJHb29kIikKICB9CiAgZWxzZXsKICAgIHJldHVybigiQWNjZXB0YWJsZSIpCiAgfQp9Cgp3aW5lczwtIHdpbmVzICU+JQogIHJvd3dpc2UoKSAlPiUKICBtdXRhdGUocmF0aW5nX2NhdGVnb3J5ID0gcmF0aW5nX2NhdGVnb3J5KHBvaW50cykpCmhlYWQod2luZXMpCmBgYAoKIyMgQWRkIEFkanVzdGVkIFBvaW50cwoKU2luY2UsIGVhY2ggcmV2aWV3ZXIgaGFzIGEgZGlmZmVyZW50IGJpYXMgd2UgY3JlYXRlZCBhIG5vcm1hbGl6ZWQgbWV0cmljLCBgbm9ybV9wb2ludHNgLCBieSBsb29raW5nIGF0IHRoZSBudW1iZXIgb2Ygc3RhbmRhcmQgZGV2aWF0aW9pbnMgYSB3aW5lIGlzIGZyb20gdGhlIHJldmlld2VyJ3MgYGF2Z19wb2ludHNgLiBUaGlzIGdpdmVzIHVzZSBhIG1vcmUgYWNjdXJhdGUgcmVwcmVzZW50YXRpb24gb2Ygd2hpY2ggd2hpY2ggd2luZXMgYXJlIGJldHRlciB0aGFuIHRoZSByZXN0LgoKYGBge3J9Cm5vcm1hbGl6ZV9wb2ludHMgPC0gZnVuY3Rpb24ocG9pbnRzLCBuYW1lKXsKICB0IDwtIHRhc3RlcnMgJT4lCiAgICBmaWx0ZXIodGFzdGVyX25hbWUgPT0gbmFtZSkKICBucm93KHQpCiAgI3JldHVybigocG9pbnRzLXQkYXZnX3BvaW50cykvdCRzZF9wb2ludHMpCn0KCndpbmVfc2FtcGxlICU+JQogIHJvd3dpc2UoKSAlPiUKICBtdXRhdGUobm9ybV9wb2ludHMgPSBub3JtYWxpemVfcG9pbnRzKHBvaW50cywgdGFzdGVyX25hbWUpKQpgYGAKCiMgRGF0YSBFeHBsb3JhdGlvbgoKIyMgVW5pdmFyaWF0ZSBFeHBsb3JhdGlvbgpDb3JyZWxhdGlvbiBgcHJpY2VgIGJ5IGBwb2ludHNgLCB1c2luZyBgYGBEYXRhRXhwbG9yZXJgYGAgbGlicmFyeSB3aGljaCBjYW4gYmUgZm91bmQgW2hlcmVdKGh0dHBzOi8vZGF0YXNjaWVuY2VwbHVzLmNvbS9ibGF6aW5nLWZhc3QtZWRhLWluLXItd2l0aC1kYXRhZXhwbG9yZXIvKQpgYGB7cn0KIyBUT0RPOiBJWlpZCmBgYAoKYGBge3J9CiMgVE9ETzogT0FTS0kgKFRoaXMgaXMgbm90IHByb2R1Y2luZyBjb3JyZWN0IHJlc3VsdHMpCndpbmVzICU+JQogICAgc3VtbWFyaXplKGF2Z19wcmljZSA9IG1lYW4ocHJpY2UsIG5hLnJtPVRSVUUpLCAKICAgICAgICAgICAgICBzZF9wcmljZSA9IHNkKHByaWNlLCBuYS5ybT1UUlVFKSwKICAgICAgICAgICAgICBsb3dlc3RfcHJpY2UgPSBtaW4ocHJpY2UsIG5hLnJtPVRSVUUpLAogICAgICAgICAgICAgIGhpZ2hlc3RfcHJpY2UgPSBtYXgocHJpY2UsbmEucm09VFJVRSkpCmBgYAoKYGBge3J9CiMgVE9ETzogT0FTS0kgKFRoaXMgaXMgbm90IHByb2R1Y2luZyBjb3JyZWN0IHJlc3VsdHMpCndpbmVzICU+JQogICAgc3VtbWFyaXplKGF2Z19wb2ludHMgPSBtZWFuKHBvaW50cywgbmEucm09VFJVRSksIAogICAgICAgICAgICAgIHNkX3BvaW50cyA9IHNkKHBvaW50cywgbmEucm09VFJVRSksCiAgICAgICAgICAgICAgbG93ZXN0X3BvaW50cyA9IG1pbihwb2ludHMsIG5hLnJtPVRSVUUpLAogICAgICAgICAgICAgIGhpZ2hlc3RfcG9pbnRzID0gbWF4KHBvaW50cyxuYS5ybT1UUlVFKSkKYGBgCgojIyBQcmljZSBieSBQb2ludHMKTm90aWNlIHRoZSBkYXRhIGlzICJzdGFja2VkIiBhbmQgdGhlIHNvY3JlcyByYW5nZSBmcm9tIDgwLTEwMApgYGB7cn0Kd2luZXMgJT4lIAogIGdncGxvdCgpICsKICBnZW9tX3BvaW50KG1hcHBpbmcgPSAoYWVzKHggPSBwb2ludHMsIHkgPSBwcmljZSkpLCBuYS5ybSA9IFQsIGFscGhhID0gMC4xNSkgKwogIGxhYnModGl0bGUgPSAiUHJpY2UgYnkgUG9pbnRzIiwgeCA9ICJQb2ludHMiLCB5ID0gIlByaWNlIikKYGBgCgpUT0RPOiBJWlpZIChXaHkgZGlkIHdlIGxvZyB0aGlzPykKCmBgYHtyfQp3aW5lcyAlPiUgCiAgZ2dwbG90KCkgKwogIGdlb21fcG9pbnQobWFwcGluZyA9IChhZXMoeCA9IHBvaW50cywgeSA9IGxvZyhwcmljZSkpKSwgbmEucm0gPSBULCBhbHBoYSA9IDAuMTUpICsKICBsYWJzKHRpdGxlID0gImxvZyhQcmljZSkgYnkgUG9pbnRzIiwgeCA9ICJQb2ludHMiLCB5ID0gImxvZyhQcmljZSkiKQpgYGAKCiMgRGF0YSBBbmFseXNpcwoKI0ZpbmQgdGhlIGJlc3QgcHJvdmluY2UgZm9yIHdpbmUgdXNpbmcgdGhlIGF2ZXJhZ2UgcG9pbnRzIGFjcm9zcyB0aGUgMSwwMDAgc2FtcGxlcwojZHJvcCB0aGUgZGVzY3JpcHRpb25zIG9yIGp1c3Qgc2VsZWN0IHByaWNlPyBzZXQgcG9pbnRzIHRvIG1heChwb2ludHMpCmBgYHtyfQpiZXN0X3Byb3ZpbmNlIDwtIHdpbmVfc2FtcGxlICU+JSAKICBncm91cF9ieShwcm92aW5jZSwgcG9pbnRzKSAlPiUgCiAgZmlsdGVyKHBvaW50cyA+IDg4LjY2OSkKYmVzdF9wcm92aW5jZSAgCmBgYAoKClJhdGluZyBkaXN0cmlidXRpb24KCmBgYHtyfQoKYGBgCgpCZXN0IHdpbmUsIGJ5IHZhcmlldHkKYGBge3J9CiN3aW5lX2Jlc3RfdmFyaWV0eSA8LSAKd2luZXMgJT4lIAogIGdyb3VwX2J5KHZhcmlldHkpICU+JSAKICBzdW1tYXJpc2UobWVhbl9wb2ludHMgPSBtZWFuKHBvaW50cykpICU+JSAKICBhcnJhbmdlKGRlc2MobWVhbl9wb2ludHMpKSAKICAKYGBgCgpgYGB7cn0KdXNlcl9wcmljZSA8LSByZWFkbGluZShwcm9tcHQgPSAiSG93IG11Y2ggYXJlIHlvdSB3aWxsaW5nIHRvIHNwZW5kIG9uIGEgYm90dGxlPyIpCnVzZXJfcHJpY2UgPC0gYXMuaW50ZWdlcih1c2VyX3ByaWNlKQoKd2luZXMgJT4lIAogIGZpbHRlcihwcmljZSA8PSB1c2VyX3ByaWNlKSAlPiUgCiAgYXJyYW5nZShkZXNjKHBvaW50cykpICU+JSAKICBzZWxlY3QodGl0bGUsIHByaWNlLCBwb2ludHMpCmBgYAoKCiMgQ29uY2x1c2lvbgo=